#!/usr/bin/python3
# -*-coding:utf-8 -*-
from copyheaders import headers_raw_to_dict
from gne import GeneralNewsExtractor
from requests_html import HTMLSession
import pandas as pd
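'''
Crawl AJAX-loaded (dynamic) data: fetch a JSON news list, then download
and extract each article the list links to.
'''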
# Raw request headers, one "name: value" pair per line, parsed into `headers`
# below. Bound to RAW_HEADERS instead of `str` so the builtin `str` type is
# not shadowed for the rest of the module.
RAW_HEADERS = b'''
referer: https://news.qq.com/
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36
'''
# Single extractor instance shared by every article request.
extraction = GeneralNewsExtractor()
# Single HTTP session reused for the list request and all article requests.
session = HTMLSession()
# Header mapping sent with every request made through `session`.
headers = headers_raw_to_dict(RAW_HEADERS)
# JSON list endpoint; the query string asks for offset=0, limit=20 and carries
# a percent-encoded JSON `ext` argument.
url = 'https://i.news.qq.com/trpc.qqnews_web.kv_srv.kv_srv_http_proxy/list?sub_srv_id=24hours&srv_id=pc&offset=0&limit=20&strategy=1&ext={%22pool%22:[%22top%22],%22is_filter%22:7,%22check_type%22:true}'


def get_second_handle(param, timeout=10):
    """Download one article page, extract its fields and print them.

    Args:
        param: URL of the article page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned; forwarded to ``session.get``.

    Returns:
        A one-row ``pandas.DataFrame`` built from the extractor's result,
        or ``None`` when fetching, decoding or extraction failed.

    Any failure is reported on stdout instead of being raised, so a single
    bad article does not abort the caller's loop.
    """
    try:
        rr = session.get(url=param, headers=headers, timeout=timeout)
        print(rr.status_code, rr.encoding)
        try:
            html = rr.content.decode('gbk')
        except UnicodeDecodeError:
            # The page is not GBK; fall back to UTF-8 and substitute any
            # undecodable bytes rather than discarding the whole article.
            html = rr.content.decode('utf-8', errors='replace')
        result = extraction.extract(html)
        # Wrap the result in a list so it always becomes exactly one row.
        # Passing the dict directly makes pandas treat list values as
        # columns: an empty list yields an empty frame, and a dict of only
        # scalars raises ValueError.
        df = pd.DataFrame([result])
        print(df)
        return df
    except Exception as e:
        # Deliberate best-effort boundary: report and carry on.
        print('Error: %s' % e)
        return None


def main():
    """Fetch the news list and process every article it links to.

    Requests the module-level ``url``, prints the response status code and
    encoding, then for each entry under ``data`` -> ``list`` in the JSON
    payload prints its title and URL and hands the URL to
    ``get_second_handle``.

    A payload without a usable list is reported on stdout; entries without
    a URL are printed and skipped.
    """
    # Bound the wait so an unresponsive server cannot hang the script.
    r = session.get(url=url, headers=headers, timeout=10)
    print(r.status_code, r.encoding)
    payload = r.json()
    # `data` or `list` may be missing or null (e.g. an error payload);
    # treat that as "no items" instead of crashing on a KeyError/TypeError.
    data = payload.get('data') if isinstance(payload, dict) else None
    html_info = (data.get('list') if isinstance(data, dict) else None) or []
    if not html_info:
        print('Error: no news items found in response')
        return
    for info in html_info:
        link = info.get('url')
        print(info.get('title'), link)
        if not link:
            # Nothing to download for this entry.
            continue
        get_second_handle(link)


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
